/* Edge length of a square patch, in elements. */
#define patchSide 8
/* log2(patchSide): shift that turns a patch row number into a row-major offset. */
#define patchSideSh 3
/* log2(side_2). */
#define side_2_sh 6
/* Number of elements in one patch (patchSide squared = 64). */
#define side_2 (patchSide * patchSide)

/* Number of patches held in one stack. */
#define N 16
/* log2(N): shift that turns a stack number into the index of its first offset entry. */
#define NSHIFT 4

/*
 * NOTE(review): PATCHSIZE evaluates to 8, the same value as patchSide; neither
 * macro below is used in the code visible here - confirm whether other code
 * relies on them before consolidating.
 */
#define PATCHSHIFT 3
#define PATCHSIZE (1<<PATCHSHIFT)

/*
 * Per-element weighting window for one patch: side_2 (64) coefficients stored
 * row-major as 8 rows of 8, indexed as (row << patchSideSh) + column.
 * The table is symmetric left/right, top/bottom and across the diagonal, with
 * its largest values (0.9718) in the four central entries.
 * NOTE(review): the name says Kaiser window; the parameters used to generate
 * the values are not recorded in this file.
 */
__constant float kaiser_window[side_2] =
{
    0.192400f, 0.298900f, 0.384600f, 0.432500f, 0.432500f, 0.384600f, 0.298900f, 0.192400f,
    0.298900f, 0.464200f, 0.597400f, 0.671700f, 0.671700f, 0.597400f, 0.464200f, 0.298900f,
    0.384600f, 0.597400f, 0.768800f, 0.864400f, 0.864400f, 0.768800f, 0.597400f, 0.384600f,
    0.432500f, 0.671700f, 0.864400f, 0.971800f, 0.971800f, 0.864400f, 0.671700f, 0.432500f,
    0.432500f, 0.671700f, 0.864400f, 0.971800f, 0.971800f, 0.864400f, 0.671700f, 0.432500f,
    0.384600f, 0.597400f, 0.768800f, 0.864400f, 0.864400f, 0.768800f, 0.597400f, 0.384600f,
    0.298900f, 0.464200f, 0.597400f, 0.671700f, 0.671700f, 0.597400f, 0.464200f, 0.298900f,
    0.192400f, 0.298900f, 0.384600f, 0.432500f, 0.432500f, 0.384600f, 0.298900f, 0.192400f
};

/*
 * Work-group width and height passed to the reqd_work_group_size attribute of
 * apply_stack; the host must enqueue the kernel with exactly this local size.
 */
#define WGS_W 8
#define WGS_H 8

/*
 * apply_stack - accumulate one stack of N patches into the output buffers.
 *
 * Each work-item handles one stack. For every element of every patch in the
 * stack it adds
 *     weight * kaiser_window[element] * stack value   to numerator,
 *     weight * kaiser_window[element]                 to denominator,
 * at the location given by that patch's entry in `offsets`.
 *
 * Parameters:
 *   _w_ind, _h_ind   Lookup tables: global id 0 / 1 is translated through them
 *                    into the column / row index (ind_i / ind_j) of the stack.
 *   _w_ind_size,
 *   _h_ind_size      Number of valid entries in _w_ind / _h_ind; work-items
 *                    whose global id is beyond them exit immediately.
 *   w_ind_size       Row length used to linearise (ind_j, ind_i) into a stack
 *                    number: ind_j * w_ind_size + ind_i.
 *   h_ind_size       Not used by this kernel; kept for interface compatibility.
 *   offsets          N entries per stack; entry n is the linear index into
 *                    numerator/denominator of the first (top-left) element of
 *                    patch n.
 *   stacks           N * side_2 floats per stack, patch after patch, each
 *                    patch stored row-major.
 *   weights          One float per stack.
 *   width            Row stride, in elements, of numerator/denominator.
 *   height           Not used by this kernel; kept for interface compatibility.
 *   numerator,
 *   denominator      Accumulation buffers, updated in place.
 *
 * NOTE(review): the accumulation is a plain read-modify-write, not an atomic
 * operation. If two work-items of the same launch can reach the same output
 * element the updates race. Presumably the host selects _w_ind/_h_ind so that
 * concurrently processed patches never overlap - confirm against the caller.
 */
__kernel __attribute__((reqd_work_group_size(WGS_W, WGS_H, 1)))
void apply_stack(
                __global unsigned*    _w_ind,
                __global unsigned*    _h_ind,
                int                   _w_ind_size,
                int                   _h_ind_size,
                int                   w_ind_size,
                int                   h_ind_size,
                __global unsigned*    offsets,
                __global float*       stacks,
                __global float*       weights,
                int                   width,
                int                   height,
                __global float*       numerator,
                __global float*       denominator)
{
    const int _ind_i = get_global_id(0);
    const int _ind_j = get_global_id(1);

    /* The global range may be padded up to the work-group size. */
    if (_ind_i >= _w_ind_size || _ind_j >= _h_ind_size)
        return;

    const unsigned ind_i = _w_ind[_ind_i];
    const unsigned ind_j = _h_ind[_ind_j];

    /* Linear stack number; selects this work-item's offsets, stack and weight. */
    const unsigned ind_offset = ind_j * w_ind_size + ind_i;
    __global const unsigned* patchOffsets = &offsets[ind_offset << NSHIFT];

    const int stackSize = side_2 << NSHIFT;   /* N patches of side_2 floats */
    __global const float* stack = &stacks[ind_offset * stackSize];

    const float weight = weights[ind_offset];

    for (unsigned n = 0; n < N; n++)
    {
        const unsigned patchOffset = patchOffsets[n];
        __global const float* patch = stack + n * side_2;

        for (unsigned p = 0; p < patchSide; p++)
        {
            /* Row starts inside the patch and inside the output image. */
            const unsigned rowOffs = p << patchSideSh;
            const unsigned rowBase = patchOffset + p * width;

            for (unsigned q = 0; q < patchSide; q++)
            {
                const unsigned sideOffs = rowOffs + q;
                const unsigned ind = rowBase + q;
                const float w = weight * kaiser_window[sideOffs];

                numerator[ind] += w * patch[sideOffs];
                denominator[ind] += w;
            }
        }
    }
}
